AMDGPU: Refactor lowering of s_barrier to split barriers #154648

nhaehnle · 2025-08-20T23:07:35Z

Let's do the lowering of non-split into split barriers in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code.

(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)

llvmbot · 2025-08-20T23:08:11Z

@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

Changes

Let's do the lowering of non-split into split barriers in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code.

(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)

Patch is 29.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154648.diff

8 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+11)
(modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (-12)
(added) llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp (+174)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+4-1)
(modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (-15)
(modified) llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll (+3-3)
(modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+10)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0059a862ba9b2..e3d4670af9e14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
 ModulePass *
 createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
 ModulePass *createAMDGPULowerBufferFatPointersPass();
+FunctionPass *createAMDGPULowerIntrinsicsLegacyPass();
 FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsLegacyPass();
 FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
@@ -153,6 +154,16 @@ struct AMDGPULowerBufferFatPointersPass
   const TargetMachine &TM;
 };
 
+void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);
+
+struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
+  AMDGPULowerIntrinsicsPass(const TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  const TargetMachine &TM;
+};
+
 void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
 extern char &AMDGPUPrepareAGPRAllocLegacyID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5d31eed8fe7d7..eabf729d4853a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2007,18 +2007,6 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
     }
   }
 
-  if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
-    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-    MachineBasicBlock *MBB = MI.getParent();
-    const DebugLoc &DL = MI.getDebugLoc();
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    MI.eraseFromParent();
-    return true;
-  }
-
   return selectImpl(MI, *CoverageInfo);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 0000000000000..fe70edb9d18f4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,174 @@
+//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower intrinsics that would otherwise require separate handling in both
+// SelectionDAG and GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerIntrinsicsImpl {
+public:
+  Function &F;
+  const AMDGPUTargetMachine &TM;
+  const GCNSubtarget &ST;
+  DominatorTree *DT = nullptr;
+  PostDominatorTree *PDT = nullptr;
+  LoopInfo *LI = nullptr;
+
+  AMDGPULowerIntrinsicsImpl(Function &F, const AMDGPUTargetMachine &TM)
+      : F(F), TM(TM), ST(TM.getSubtarget<GCNSubtarget>(F)) {}
+
+  bool run();
+
+private:
+  bool visit(Instruction *I);
+  void splitBarrier(IntrinsicInst &I);
+};
+
+class AMDGPULowerIntrinsicsLegacy : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPULowerIntrinsicsLegacy() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+  }
+};
+
+} // anonymous namespace
+
+bool AMDGPULowerIntrinsicsImpl::run() {
+  if (!ST.hasSplitBarriers())
+    return false;
+
+  bool Changed = false;
+
+  for (auto BBIt = F.begin(); BBIt != F.end(); ++BBIt) {
+    for (auto IIt = BBIt->begin(); IIt != BBIt->end();) {
+      Instruction *I = &*IIt;
+      ++IIt; // early increment in case the instruction is erased
+      if (I->isTerminator())
+        continue;
+
+      Changed |= visit(I);
+
+      // Fixup the basic block iterator in case the CFG was changed.
+      BBIt = IIt->getParent()->getIterator();
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPULowerIntrinsicsImpl::visit(Instruction *I) {
+  if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::amdgcn_s_barrier:
+      if (ST.hasSplitBarriers()) {
+        splitBarrier(*II);
+        return true;
+      }
+      return false;
+    default:
+      return false;
+    }
+  }
+
+  return false;
+}
+
+// Lower s_{cluster_}barrier to a sequence of split barrier intrinsics.
+void AMDGPULowerIntrinsicsImpl::splitBarrier(IntrinsicInst &I) {
+  assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
+
+  IRBuilder<> B(&I);
+  unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
+  bool IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
+
+  if (IsSingleWaveWG) {
+    // The workgroup fits in a single wave. Downgrade to a wave barrier.
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+  } else {
+    // Lower to split barriers.
+    Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+    Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+                      {BarrierID_32});
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+                      {BarrierID_16});
+  }
+
+  I.eraseFromParent();
+}
+
+PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  AMDGPULowerIntrinsicsImpl Impl(F,
+                                 static_cast<const AMDGPUTargetMachine &>(TM));
+  Impl.DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  Impl.PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  Impl.LI = AM.getCachedResult<LoopAnalysis>(F);
+  if (!Impl.run())
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  return PA;
+}
+
+bool AMDGPULowerIntrinsicsLegacy::runOnFunction(Function &F) {
+  auto &TPC = getAnalysis<TargetPassConfig>();
+  const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
+
+  AMDGPULowerIntrinsicsImpl Impl(F, TM);
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+  auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  Impl.PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+  Impl.LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+  return Impl.run();
+}
+
+#define PASS_DESC "AMDGPU lower intrinsics"
+INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+                    false)
+
+char AMDGPULowerIntrinsicsLegacy::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
+  return new AMDGPULowerIntrinsicsLegacy;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e393aa1987744..d9d06d6814386 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -538,6 +538,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
   initializeAMDGPULowerBufferFatPointersPass(*PR);
+  initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
   initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
   initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1379,6 +1380,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
     // nodes out of the graph, which leads to function-level passes not
     // being run on them, which causes crashes in the resource usage analysis).
     addPass(createAMDGPULowerBufferFatPointersPass());
+    addPass(createAMDGPULowerIntrinsicsLegacyPass());
     // In accordance with the above FIXME, manually force all the
     // function-level passes into a CGSCCPassManager.
     addPass(new DummyCGSCCPass());
@@ -2116,9 +2118,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
   // nodes out of the graph, which leads to function-level passes not
   // being run on them, which causes crashes in the resource usage analysis).
   addPass(AMDGPULowerBufferFatPointersPass(TM));
-
   addPass.requireCGSCCOrder();
 
+  addPass(AMDGPULowerIntrinsicsPass(TM));
+
   Base::addCodeGenPrepare(addPass);
 
   if (isPassEnabled(EnableLoadStoreVectorizer))
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..619ff4e5c73c4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -71,6 +71,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUImageIntrinsicOptimizer.cpp
   AMDGPULibFunc.cpp
   AMDGPULowerBufferFatPointers.cpp
+  AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..19e93742633ca 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10439,21 +10439,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       }
     }
 
-    if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
-      // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-      SDValue K =
-          DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
-      SDValue BarSignal =
-          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
-                                     MVT::Other, K, Op.getOperand(0)),
-                  0);
-      SDValue BarWait =
-          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
-                                     BarSignal.getValue(0)),
-                  0);
-      return BarWait;
-    }
-
     return SDValue();
   };
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index ceed41f3ed7c5..460e69622a652 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -8,11 +8,11 @@
 ; RUN:   | FileCheck -check-prefix=GCN-O3 %s
 
 
-; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(AMDGPULowerIntrinsicsPass,lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
-; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(AMDGPULowerIntrinsicsPass,codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-m...
[truncated]

llvm/lib/Target/AMDGPU/AMDGPU.h

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp

arsenm · 2025-08-20T23:28:56Z

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

-    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-    MachineBasicBlock *MBB = MI.getParent();
-    const DebugLoc &DL = MI.getDebugLoc();
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    MI.eraseFromParent();


This isn't so much code that duplicating between globalisel and selection dag is a problem. Is a whole pass really needed for this one tiny case? Can we get this into one of the existing IR lowering passes (e.g. PreISelIntrinsicLowering)?

PreISelIntrinsicLowering doesn't handle target-specific intrinsics. So is it better to teach that pass how to call into target hooks to handle target intrinsics, or to have a target-specific pass like in this patch?

I've looked at this. We could perhaps add a hook to TargetTransformInfo, but it feels a bit out of place. PreISelIntrinsicLowering iterates over the functions of the module and looks at calls from there, which means that hooking anything into that pass would basically only re-use this loop over the functions of the module.

I can sort of see the point about breaking the module pass manager (although -- isn't it really the other way around? Breaking up a function pass manager hurts, but a function pass inside a module pass manager ought to be fairly unproblematic), and perhaps a larger point that iterating over all code just for this is a bit excessive.

So I'm going to change this into a module pass that iterates like PreISelIntrinsicLowering.

As for how much code this is: I ran into this precisely because it's about to become a whole lot more code in the near future. This change is simply trying to prepare upstream for that.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

github-actions · 2025-08-22T00:28:23Z

✅ With the latest revision this PR passed the C/C++ code formatter.

nhaehnle · 2025-08-22T17:19:05Z

Addressed the review comments and rebased on main to fix a build issue.

nhaehnle · 2025-08-25T23:22:28Z

Gentle ping

Let's do the lowering of non-split into split barriers and the downgrading of barriers based on the workgroup size in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code. v2: - turn into a Module pass - also handle the downgrading of barriers for single-wave workgroups in the IR pass - add tests for the new pass (cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)

rampitec

LGTM

arsenm · 2025-08-28T15:21:57Z

I don't think this justifies a new pass and I'd rather just duplicate the code between the selectors. If you really wanted the code savings it would make more sense as an isel pseudo

llvm-ci · 2025-08-28T17:00:49Z

LLVM Buildbot has detected a new failure on builder sanitizer-x86_64-linux-android running on sanitizer-buildbot-android while building llvm at step 2 "annotate".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/11973

Here is the relevant piece of the build log for the reference

Step 2 (annotate) failure: 'python ../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py' (failure)
...
[       OK ] AddressSanitizer.AtoiAndFriendsOOBTest (2239 ms)
[ RUN      ] AddressSanitizer.HasFeatureAddressSanitizerTest
[       OK ] AddressSanitizer.HasFeatureAddressSanitizerTest (0 ms)
[ RUN      ] AddressSanitizer.CallocReturnsZeroMem
[       OK ] AddressSanitizer.CallocReturnsZeroMem (14 ms)
[ DISABLED ] AddressSanitizer.DISABLED_TSDTest
[ RUN      ] AddressSanitizer.IgnoreTest
[       OK ] AddressSanitizer.IgnoreTest (0 ms)
[ RUN      ] AddressSanitizer.SignalTest
[       OK ] AddressSanitizer.SignalTest (197 ms)
[ RUN      ] AddressSanitizer.ReallocTest
[       OK ] AddressSanitizer.ReallocTest (25 ms)
[ RUN      ] AddressSanitizer.WrongFreeTest
[       OK ] AddressSanitizer.WrongFreeTest (100 ms)
[ RUN      ] AddressSanitizer.LongJmpTest
[       OK ] AddressSanitizer.LongJmpTest (0 ms)
[ RUN      ] AddressSanitizer.ThreadStackReuseTest
[       OK ] AddressSanitizer.ThreadStackReuseTest (11 ms)
[ DISABLED ] AddressSanitizer.DISABLED_MemIntrinsicUnalignedAccessTest
[ DISABLED ] AddressSanitizer.DISABLED_LargeFunctionSymbolizeTest
[ DISABLED ] AddressSanitizer.DISABLED_MallocFreeUnwindAndSymbolizeTest
[ RUN      ] AddressSanitizer.UseThenFreeThenUseTest
[       OK ] AddressSanitizer.UseThenFreeThenUseTest (123 ms)
[ RUN      ] AddressSanitizer.FileNameInGlobalReportTest
[       OK ] AddressSanitizer.FileNameInGlobalReportTest (116 ms)
[ DISABLED ] AddressSanitizer.DISABLED_StressStackReuseAndExceptionsTest
[ RUN      ] AddressSanitizer.MlockTest
[       OK ] AddressSanitizer.MlockTest (0 ms)
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadedTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowIn
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowLeft
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowRight
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFHigh
[ DISABLED ] AddressSanitizer.DISABLED_DemoOOM
[ DISABLED ] AddressSanitizer.DISABLED_DemoDoubleFreeTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoNullDerefTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoFunctionStaticTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoTooMuchMemoryTest
[ RUN      ] AddressSanitizer.LongDoubleNegativeTest
[       OK ] AddressSanitizer.LongDoubleNegativeTest (0 ms)
[----------] 19 tests from AddressSanitizer (27916 ms total)

[----------] Global test environment tear-down
[==========] 22 tests from 2 test suites ran. (27920 ms total)
[  PASSED  ] 22 tests.

  YOU HAVE 1 DISABLED TEST

Step 24 (run instrumented asan tests [aarch64/aosp_coral-userdebug/AOSP.MASTER]) failure: run instrumented asan tests [aarch64/aosp_coral-userdebug/AOSP.MASTER] (failure)
...
[ RUN      ] AddressSanitizer.HasFeatureAddressSanitizerTest
[       OK ] AddressSanitizer.HasFeatureAddressSanitizerTest (0 ms)
[ RUN      ] AddressSanitizer.CallocReturnsZeroMem
[       OK ] AddressSanitizer.CallocReturnsZeroMem (8 ms)
[ DISABLED ] AddressSanitizer.DISABLED_TSDTest
[ RUN      ] AddressSanitizer.IgnoreTest
[       OK ] AddressSanitizer.IgnoreTest (0 ms)
[ RUN      ] AddressSanitizer.SignalTest
[       OK ] AddressSanitizer.SignalTest (325 ms)
[ RUN      ] AddressSanitizer.ReallocTest
[       OK ] AddressSanitizer.ReallocTest (24 ms)
[ RUN      ] AddressSanitizer.WrongFreeTest
[       OK ] AddressSanitizer.WrongFreeTest (248 ms)
[ RUN      ] AddressSanitizer.LongJmpTest
[       OK ] AddressSanitizer.LongJmpTest (0 ms)
[ RUN      ] AddressSanitizer.ThreadStackReuseTest
[       OK ] AddressSanitizer.ThreadStackReuseTest (16 ms)
[ DISABLED ] AddressSanitizer.DISABLED_MemIntrinsicUnalignedAccessTest
[ DISABLED ] AddressSanitizer.DISABLED_LargeFunctionSymbolizeTest
[ DISABLED ] AddressSanitizer.DISABLED_MallocFreeUnwindAndSymbolizeTest
[ RUN      ] AddressSanitizer.UseThenFreeThenUseTest
[       OK ] AddressSanitizer.UseThenFreeThenUseTest (303 ms)
[ RUN      ] AddressSanitizer.FileNameInGlobalReportTest
[       OK ] AddressSanitizer.FileNameInGlobalReportTest (298 ms)
[ DISABLED ] AddressSanitizer.DISABLED_StressStackReuseAndExceptionsTest
[ RUN      ] AddressSanitizer.MlockTest
[       OK ] AddressSanitizer.MlockTest (0 ms)
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadedTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowIn
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowLeft
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowRight
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFHigh
[ DISABLED ] AddressSanitizer.DISABLED_DemoOOM
[ DISABLED ] AddressSanitizer.DISABLED_DemoDoubleFreeTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoNullDerefTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoFunctionStaticTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoTooMuchMemoryTest
[ RUN      ] AddressSanitizer.LongDoubleNegativeTest
[       OK ] AddressSanitizer.LongDoubleNegativeTest (0 ms)
[----------] 19 tests from AddressSanitizer (71678 ms total)

[----------] Global test environment tear-down
[==========] 22 tests from 2 test suites ran. (71681 ms total)
[  PASSED  ] 22 tests.

  YOU HAVE 1 DISABLED TEST

Serial 17031FQCB00176

nhaehnle · 2025-09-02T23:49:12Z

I'm not a fan of doing ever more stuff in MachineIR. The tooling around IR is better, the compiler code working on it tends to be cleaner, and it's generally better understood by more people.

In this particular case for example, we have downstream changes that introduce CFG manipulation into this code. And yeah, we have plenty of code that does that in MachineIR, but I still think it's better to do that kind of thing in IR.

nhaehnle requested a review from rampitec August 20, 2025 23:07

llvmbot added the backend:AMDGPU label Aug 20, 2025

arsenm reviewed Aug 20, 2025

View reviewed changes

jayfoad reviewed Aug 21, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/SIISelLowering.cpp Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp Outdated Show resolved Hide resolved

nhaehnle force-pushed the pub-split-barriers branch from fb93c7e to 773769f Compare August 22, 2025 00:24

nhaehnle force-pushed the pub-split-barriers branch 3 times, most recently from 49f1318 to 6727443 Compare August 22, 2025 16:12

nhaehnle force-pushed the pub-split-barriers branch from 6727443 to de0e91f Compare August 26, 2025 19:05

rampitec approved these changes Aug 26, 2025

View reviewed changes

Stylistic change

56f898c

nhaehnle merged commit 353b5e4 into llvm:main Aug 28, 2025
9 checks passed

nhaehnle deleted the pub-split-barriers branch August 28, 2025 14:02

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Refactor lowering of s_barrier to split barriers #154648

AMDGPU: Refactor lowering of s_barrier to split barriers #154648

Uh oh!

nhaehnle commented Aug 20, 2025

Uh oh!

llvmbot commented Aug 20, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

arsenm Aug 20, 2025

Uh oh!

jayfoad Aug 21, 2025

Uh oh!

nhaehnle Aug 21, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented Aug 22, 2025 •

edited

Loading

Uh oh!

nhaehnle commented Aug 22, 2025

Uh oh!

nhaehnle commented Aug 25, 2025

Uh oh!

rampitec left a comment

Uh oh!

Uh oh!

arsenm commented Aug 28, 2025

Uh oh!

llvm-ci commented Aug 28, 2025

Uh oh!

nhaehnle commented Sep 2, 2025 •

edited

Loading

Uh oh!

Uh oh!

AMDGPU: Refactor lowering of s_barrier to split barriers #154648

AMDGPU: Refactor lowering of s_barrier to split barriers #154648

Uh oh!

Conversation

nhaehnle commented Aug 20, 2025

Uh oh!

llvmbot commented Aug 20, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

arsenm Aug 20, 2025

Choose a reason for hiding this comment

Uh oh!

jayfoad Aug 21, 2025

Choose a reason for hiding this comment

Uh oh!

nhaehnle Aug 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented Aug 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

nhaehnle commented Aug 22, 2025

Uh oh!

nhaehnle commented Aug 25, 2025

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

arsenm commented Aug 28, 2025

Uh oh!

llvm-ci commented Aug 28, 2025

Uh oh!

nhaehnle commented Sep 2, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

nhaehnle Aug 21, 2025 •

edited

Loading

github-actions bot commented Aug 22, 2025 •

edited

Loading

nhaehnle commented Sep 2, 2025 •

edited

Loading